Third Week: Exploratory Data Analysis

LaLiga Analysis

Shabnam Sheikhha (95105679)

2018-02-27 17:36:52

Third Assignment

At first, I have made few changes to the spain data frame and made up a few new ones that i will frequently visit to answer the following questions.

Final results of each season including rank, score, total goals, etc.

rbind(
  spain %>%
    select(Season, team = home, opp = visitor, GF = hgoal, GA = vgoal),
  spain %>%
    select(Season, team = visitor, opp = home, GF = vgoal, GA = hgoal)
) %>% 
  mutate(GD = GF-GA) %>% 
  group_by(team,Season) %>% 
  summarize(GP = n(),
            goalsF = sum(GF),
            goalsA = sum(GA),
            goaldif = sum(GD),
            W = sum(GD>0),
            D = sum(GD==0),
            L = sum(GD<0)
  ) %>% 
  mutate(score = W * 3 + D) %>%
  arrange(Season,desc(score),desc(goaldif), desc(goalsF)) %>% 
  group_by(Season) %>% 
  mutate(rank = row_number(-score) %>% as.integer()) -> laliga_fullseason

Results of each half season including rank, score, total goals, etc.

rbind(
  spain %>%
    select(Season, team = home, opp = visitor, GF = hgoal, GA = vgoal),
  spain %>%
    select(Season, team = visitor, opp = home, GF = vgoal, GA = hgoal)
) %>% 
  arrange(Season) %>%
  group_by(Season) %>%
  filter((row_number(Season) <= max(row_number(Season)) / 4) | 
           (row_number(Season) > max(row_number(Season)) / 2 & 
              row_number(Season) <= 3 * max(row_number(Season)) / 4)) %>%
  mutate(GD = GF-GA) %>% 
  group_by(team,Season) %>% 
  summarize(GP = n(),
            goalsF = sum(GF),
            goalsA = sum(GA),
            goaldif = sum(GD),
            W = sum(GD>0),
            D = sum(GD==0),
            L = sum(GD<0)
  ) %>% 
  mutate(score = W * 3 + D) %>%
  arrange(Season,desc(score), goaldif) %>% 
  group_by(Season) %>% 
  mutate(rank = row_number(Season) %>% as.integer()) -> laliga_halfseason

Weekly results of season including rank, score, total goals, etc.

spain %>%
  group_by(Season) %>%
  mutate(teamnum = n_distinct(home)) %>%
  arrange(Date) %>%
  mutate(week = (row_number(Date) - 1) %/% (teamnum / 2) + 1) -> divweek


rbind(
  divweek %>%
    select(week, Date, teamnum, Season, team = home, opp = visitor, GF = hgoal, GA = vgoal, FT),
  divweek %>%
    select(week, Date, teamnum, Season, team = visitor, opp = home, GF = vgoal, GA = hgoal, FT)
) %>% 
  mutate(GD = GF-GA) %>%  
  mutate(W = ifelse(GD > 0,1,0),
         D = ifelse(GD == 0,1,0),
         L = ifelse(GD < 0,1,0)
  ) %>%
  arrange(Season, week) %>%
  group_by(Season, team) %>% 
  mutate(goalsF = cumsum(GF),
         goalsA = cumsum(GA),
         goaldif = cumsum(GD),
         W = cumsum(W),
         D = cumsum(D),
         L = cumsum(L)
  ) %>% 
  mutate(score = W*3 + D) %>% 
  arrange(week, desc(score),desc(goaldif), desc(goalsF)) %>%
  group_by(Season, week) %>%
  mutate(rank = row_number(-score)) -> laliga_weekly

Now, I will proceed to answer the following questions by making slight changes to the data frames mentioned above.


Q1: Total number of wins for each champion

Having the laliga_fullseason data frame, I only need to chose the teams that ranked first in each season, group the data frame by those teams, and arrange them based on total number of wins.

laliga_fullseason %>%
  filter(rank == 1) %>%
  group_by(team) %>%
  summarise(totwins = n()) %>%
  arrange(desc(totwins)) -> laliga_winners

laliga_winners$team <- factor(laliga_winners$team, levels=laliga_winners$team[order(laliga_winners$totwins)])

ggplot2:

ggplot(data = laliga_winners, aes(x = team, y = totwins)) + 
  geom_bar(stat = "identity", aes(fill = team)) +  
  geom_label(aes(label = paste("Number of wins:", totwins), color = team), 
            size = 2, fontface = "bold.italic") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
  scale_fill_brewer(palette = 4) + 
  scale_color_brewer(palette = 4)

highcharter:

laliga_winners %>%
hchart(type = "column",
       hcaes(x = team, y = totwins),
       color = "darkmagenta",
       name = "Title Wins") %>%
  hc_yAxis(title = list(text = "Number of Title Wins")) %>%
  hc_xAxis(title = list(text = "Clubs"), 
           opposite = TRUE, 
           tickColor = "darkmagenta", 
           tickLength = 3, 
           tickWidth = 1.5) %>%
  hc_title(text = "Top 10 Most Title Wins") %>%
  hc_subtitle(text = "Comparing Tital Sins of LaLiga Clubs") %>%
  hc_add_theme(hc_theme_google())

Q2: The Most Boring League Seasons and Teams

The Most Boring League Seasons

I’ve summarised each season with three colomns: 1. number of total goals = A 2. number of games with no goals in halftime = B 3. number of games with 0-0 final results = C 4. number of games with 1-1 final results = D I’ve computet the boringness of each league by this equation:

\[boringness = \frac{10 * C + 7 * B + 5 * D - A}{10}\]

Afterwards, I sort them in descending order based of this value.

library(stringr)
as.tbl(spain) %>%
  mutate(totGF = hgoal + vgoal, 
         totGH = as.numeric(str_sub(HT, start = 1, end = 1)) + 
           as.numeric(str_sub(HT, start = 3, end = 3)), 
         FHdif = totGF - totGH) %>%
  group_by(Season) %>%
  summarise(goal = sum(totGF) / n(), 
            nogoal = sum(hgoal + vgoal == 0) / n(), 
            draws = sum(hgoal == vgoal & hgoal + vgoal == 2) / n(), 
            nogoalht = sum(FHdif == 0) / n()) %>%
  mutate(boringness = (-1 * goal + 10 * nogoal + 7 * nogoalht + 5 * draws) / 10)%>%
  arrange(desc(boringness)) %>%
  slice(1:10) -> boring_season

ggplot2:

boring_season$Season <- factor(boring_season$Season, levels=boring_season$Season[order(boring_season$boringness)])

##TODO FIX THIS

ggplot(data = boring_season, aes(x = Season)) + 
  geom_point(aes(y = boringness, color = "Boringness(total)"), size = 2) + 
  geom_line(aes(y = boringness, color = "Boringness(total)"), group = 1, size = 2) + 
  geom_point(aes(y = goal / 10, color = "Goals")) + 
  geom_line(aes(y = goal / 10, color = "Goals"), group = 1) + 
  geom_point(aes(y = nogoal, color = "0-0 Results")) + 
  geom_line(aes(y = nogoal, color = "0-0 Results"), group = 1) + 
  geom_point(aes(y = draws, color = "1-1 Results")) + 
  geom_line(aes(y = draws, color = "1-1 Results"), group = 1) + 
  geom_point(aes(y = nogoalht, color = "No Goals in Halftime")) + 
  geom_line(aes(y = nogoalht, color = "No Goals in Halftime"), group = 1) + 
  theme(legend.title = element_blank())

highcharter:

boring_season %>%
  hchart(type = "column",
         hcaes(x = as.character(Season), y = round(boringness, 3)), 
         name = "Boringness", 
         dataLabels = list(enabled = T)) %>%
  hc_add_series(data = round(boring_season$nogoal, 2), name = "0-0 results", 
                dataLabels = list(enabled = T)) %>%
  hc_add_series(data = round(boring_season$draws, 2), name = "1-1 results",
                dataLabels = list(enabled = T)) %>%
  hc_add_series(data = round(boring_season$goal/ 10, 2), name = "goals", 
                dataLabels = list(enabled = T)) %>%
  hc_add_series(data = round(boring_season$nogoalht, 2), name = "No Goals in ht", 
                dataLabels = list(enabled = T)) %>%
  hc_yAxis(title = list(text = "Boringness")) %>%
  hc_xAxis(title = list(text = "Season")) %>%
  hc_title(text = "Top 10 Most Boring Season") %>% 
  hc_subtitle(text = "Based on goals, 0-0 or 1-1 results and games with no goals in halftime") %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(hc_theme_google())

The Most Boring Teams

I’ve summarised each team by these columns: 1. mean of all the goals = A 2. mean of all the games with no goals = B 3. mean of all games with draw results = C Then I calculate boringness of each team by this equation:

\[boringness = \frac{10 * B + 5 * C - A}{10}\]

Afterwards, I again sort the games in descending order based on this value.

rbind(
  spain %>%
    select(Season, team = home, 
           opp = visitor, GF = hgoal, GA = vgoal),
  spain %>%
    select(Season, team = visitor, 
           opp = home, GF = vgoal, GA = hgoal)
) %>% 
  group_by(team) %>%
  summarise(goal = sum(GF) / n(), 
            nogoal = sum(GF == 0) / n(), 
            draw = sum(GF == GA) / n())%>%
  mutate(boringness = (-1 * goal + 10 * nogoal + 5 * draw) /10) %>%
  arrange(desc(boringness)) %>%
  slice(1:10) -> boring_team

ggplot:

boring_team$team = factor(boring_team$team, levels=boring_team$team[order(boring_team$boringness)])

ggplot(data = boring_team, aes(x = team)) + 
  geom_point(aes(y = boringness, color = "Boringness(total)"), size = 2) + 
  geom_line(aes(y = boringness, color = "Boringness(total)"), group = 1, size = 2) + 
  geom_point(aes(y = goal / 10, color = "Goals")) + 
  geom_line(aes(y = goal / 10, color = "Goals"), group = 1) + 
  geom_point(aes(y = nogoal, color = "no goals scored")) + 
  geom_line(aes(y = nogoal, color = "no goals scored"), group = 1) + 
  geom_point(aes(y = draw, color = "draw")) + 
  geom_line(aes(y = draw, color = "draw"), group = 1) + 
  theme(legend.title = element_blank(), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) 

highcharter:

boring_team %>%
  hchart(type = "column",
         hcaes(x = team, y = round(boringness, 3)), 
         name = "Boringness", 
         dataLabels = list(enabled = T)) %>%
  hc_add_series(data = round(boring_team$nogoal, 2), name = "no goals scored", 
                dataLabels = list(enabled = T)) %>%
  hc_add_series(data = round(boring_team$draw, 2), name = "draw", 
                dataLabels = list(enabled = T)) %>%
  hc_add_series(data = round(boring_team$goal, 2) / 10, name = "goals", 
                dataLabels = list(enabled = T)) %>%
  hc_yAxis(title = list(text = "Boringness")) %>%
  hc_xAxis(title = list(text = "Team")) %>%
  hc_title(text = "Top 10 Most Boring Teams") %>% 
  hc_subtitle(text = "Based on goals, games with no goal scored, and games with draw results.") %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(hc_theme_google())

Q3: How many times has the half season winner won the league?

I’ve used the laliga_halfseason and laliga_fullseason data frames I’ve built in the beginning. From both data frames I’ve chosen the teams that have ranked first. Then I’ve used the full_join() function.

laliga_halfseason %>%
  filter(rank == 1) %>%
  select(Season, halfwinner = team) -> laliga_winner_hs

laliga_fullseason %>%
  filter(rank == 1) %>%
  select(Season, fullwinner = team) -> laliga_winner_fs

hs_vs_fs <- full_join(laliga_winner_hs, laliga_winner_fs)

gplot2:

ggplot(data = hs_vs_fs) + 
  geom_label(aes(x = halfwinner, y= fullwinner, 
                 fill = (halfwinner == fullwinner), 
                 label = Season), 
             size = 3, 
             label.r = unit(0, "lines"), 
             color = "white") +
  scale_fill_manual(labels = c("half season winner didn't win the league", 
                               "half season winner won the league"), 
                    values = c("TRUE"= "green", 
                               "FALSE"= "red")) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        legend.title = element_blank(), 
        legend.background = element_blank(), 
        legend.key = element_blank(), 
        legend.position = "top") + 
  xlab("Half Winner") +
  ylab("Full Winner")

highcharter:

hchart(hs_vs_fs, "column", 
       hcaes(x = fullwinner, 
             y = Season, 
             group = (fullwinner == halfwinner))) %>%
  hc_title(text = "Was the Title Winner also the Half winner?") %>% 
  hc_subtitle(text = "Red Line: yes, Blue Line: No") %>%
  hc_yAxis(title = list(text = "Title Winner")) %>%
  hc_add_theme(hc_theme_google())
hs_vs_fs %>%
  mutate(equal = ifelse(halfwinner == fullwinner, 
                        "Half Season Winner <br> Won The League", 
                        "Half Season Winner <br> Didn't Win The League")) %>%
  group_by(equal) %>%
  summarise(count = n())-> piestat

piestat %>%
  hchart(type = "pie", 
         hcaes(x = equal, y = round(count / sum(count), 2))) %>%
  hc_title(text = "Comparison of halfseason winner and league winner") %>%
  hc_add_theme(hc_theme_538())
per <- nrow(hs_vs_fs %>%
              filter(halfwinner == fullwinner)) / 
  nrow(hs_vs_fs)
per <- unname(per)
per <- unlist(per)
library(scales)
show(percent(per))
[1] "52.3%"

  Q4: Between the years 2001 and 2010, which teams were the “Black Cats” of big teams?

I’ve used the laliga_fullseason data frame and chosen the four teams with the most average number of championships.

laliga_fullseason %>%
  filter(Season %in% seq(2001, 2010)) %>%
  group_by(team) %>%
  summarise(overallrank = mean(rank)) %>%
  arrange(overallrank) %>%
  slice(1:4) %>%
  select(team) %>%
  unlist() %>%
  unname() -> bigteams

Again, I use the laliga_fullseason data frame and choose the teams which ranked in the last quarter. In a new column named info, I’ve kept the weakest teams and the season in which they were the weakest.

laliga_fullseason %>%
  filter(Season %in% seq(2001, 2010)) %>%
  filter(rank > 3 * n() / 4) %>%
  select(Season, team) %>%
  mutate(info = paste(Season, team)) %>%
  ungroup() %>%
  select(info) %>%
  unlist() %>%
  unname() -> weekteams

Then I’ve filtered the games in which these weak teams have won the big teams.

rbind(
  spain %>%
    select(Season, team = home, opp = visitor, GF = hgoal, GA = vgoal),
  spain %>%
    select(Season, team = visitor, opp = home, GF = vgoal, GA = hgoal)
) %>%
  filter(Season %in% seq(2001, 2010)) %>%
  mutate(info = paste(Season, team)) %>%
  filter(info %in% weekteams) %>%
  filter(opp %in% bigteams) %>%
  filter(GF > GA) -> blackcats

blackcats <- blackcats %>%
  select(-info)
Season team opp GF GA
2001 CA Osasuna Sevilla FC 1 0
2001 UD Las Palmas Real Madrid 4 2
2001 Real Zaragoza Real Madrid 2 1
2001 UD Las Palmas Sevilla FC 1 0
2001 CA Osasuna Real Madrid 3 1
2002 Racing Santander Real Madrid 2 0
2002 CD Alaves Sevilla FC 1 0
2002 Rayo Vallecano FC Barcelona 1 0
2002 Racing Santander Sevilla FC 1 0
2002 Racing Santander Valencia CF 2 1
2003 Espanyol Barcelona Sevilla FC 1 0
2003 Racing Santander FC Barcelona 3 0
2003 Espanyol Barcelona Valencia CF 2 1
2003 Real Valladolid Sevilla FC 2 0
2003 Celta Vigo FC Barcelona 1 0
2003 Real Murcia Real Madrid 2 1
2004 CD Numancia Sevilla FC 2 1
2004 Racing Santander Valencia CF 1 0
2005 CD Alaves Sevilla FC 2 1
2005 Racing Santander Valencia CF 2 1
2006 Celta Vigo Valencia CF 3 2
2006 Real Betis Valencia CF 2 1
2006 Gimnastic Sevilla FC 1 0
2006 Athletic Bilbao Valencia CF 1 0
2007 Real Zaragoza Sevilla FC 2 0
2007 Real Murcia Valencia CF 1 0
2008 CD Numancia FC Barcelona 1 0
2008 Getafe CF Real Madrid 3 1
2008 Sporting Gijon Sevilla FC 1 0
2008 CD Numancia Valencia CF 2 1
2009 Real Valladolid Sevilla FC 2 1
2010 Hercules CF Sevilla FC 2 0
2010 Getafe CF Sevilla FC 1 0
2001 CA Osasuna FC Barcelona 1 0
2003 Racing Santander Valencia CF 2 1
2003 Celta Vigo Sevilla FC 1 0
2005 Racing Santander Real Madrid 2 1
2006 Celta Vigo Real Madrid 2 1
2008 Sporting Gijon Valencia CF 3 2
2008 Real Betis Sevilla FC 2 1
2008 Getafe CF Sevilla FC 1 0
2009 Racing Santander Sevilla FC 2 1
2010 Hercules CF FC Barcelona 2 0
2010 RCD Mallorca Valencia CF 2 1
2010 RCD Mallorca Sevilla FC 2 1
2010 Getafe CF Sevilla FC 3 1
2010 UD Almeria Sevilla FC 3 1

  Q5: Fastest championship? Best championship?

  Fastest championship

Again, I’ve used the laliga_weekly data frame. I add two new columns. In one I’ve calculated the difference between teams with successive ranks, and in the other the number of weeks left after each week(which is the number of remaining matches for each team). The I’ve chosen the teams that have ranked first with a score difference of more than 3 times the remaining games.

laliga_weekly %>%
  mutate(scoredif = score - lead(score)) %>%
  arrange(Season, week, rank) %>%
  mutate(gamesleft = (teamnum - 1) * 2 - week) %>%
  filter(rank == 1) %>%
  filter(scoredif > gamesleft * 3) %>%
  select(Season, week, team, score, gamesleft) %>%
  group_by(Season) %>%
  filter(gamesleft == max(gamesleft)) %>%
  arrange(desc(gamesleft)) -> fastestwin
show(fastestwin$team[1])
[1] "Real Madrid"
show(fastestwin$Season[1])
[1] 1960
show(fastestwin$gamesleft[1])
[1] 5

ggplot2:

fastestwin %>%
  group_by(team) %>%
  summarise(avg = mean(gamesleft)) %>%
  arrange(desc(avg)) -> fastestwinstat

fastestwinstat$team <- factor(fastestwinstat$team, levels=fastestwinstat$team[order(fastestwinstat$avg)])

ggplot(data = fastestwinstat, aes(x = team, y = avg, color = team)) + 
  geom_point()+ 
  geom_line(group = 1, size = 1.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
  scale_color_brewer(palette = 11) + 
  labs(title = "Average of games left after definite win", 
       x = "Team", y = "games left average")

highcharter:

fastestwinstat %>%
  hchart(hcaes(x = team, y = avg, group = avg),
         showInLegend = FALSE, 
         type = "column", 
         name = "mean") %>%
  hc_yAxis(title = list(text = "Games Left After Definite Win")) %>%
  hc_xAxis(title = list(text = "Team")) %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(hc_theme_google())

  Best championship

Using the laliga_fullseason data frame, I’ve added a new column calculating the difference between the scores of two successive teams. Then I’ve filtered the teams that have ranked first and sorted the data based on their score difference.

laliga_fullseason %>%
  mutate(dif = abs(score - lead(score))) %>%
  filter(rank == 1) %>%
  arrange(desc(dif)) -> bestwin
show(bestwin$team[1])
[1] "Real Madrid"
show(bestwin$Season[1])
[1] 1962
show(bestwin$dif[1])
[1] 21

ggplot2:

bestwin %>%
  group_by(team) %>%
  summarise(avg = mean(dif)) %>%
  arrange(desc(avg)) -> bestwinstat

bestwinstat$team <- factor(bestwinstat$team, levels=bestwinstat$team[order(bestwinstat$avg)])

ggplot(data = bestwinstat, aes(x = team, y = avg, color = team)) + 
  geom_point()+ 
  geom_line(group = 1, size = 1.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
  scale_color_brewer(palette = 15) + 
  labs(title = "Average of score difference in champion and runnerup", 
       x = "Team", y = "Score Difference")

highcharter:

bestwinstat %>%
  hchart(hcaes(x = team, y = avg, group = team), 
         type = "column", 
         name = "mean", 
         showInLegend= FALSE) %>%
  hc_yAxis(title = list(text = "Score Difference")) %>%
  hc_xAxis(title = list(text = "Team")) %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(hc_theme_google())

***

  Q6: Longest winning/losing/draw streak?

I’ve used the laliga_result data frame and added two new columns two it. One states the result (Win, Draw or Lose), the other states the consecutive wins/draws/loses until then.

laliga_result <- rbind(
  spain %>%
    select(Date, Season, team = home, opp = visitor, GF = hgoal, GA = vgoal),
  spain %>%
    select(Date, Season, team = visitor, opp = home, GF = vgoal, GA = hgoal)
) %>%
  mutate(GD = GF-GA) %>%
  mutate(result = ifelse(GD > 0, "Win", ifelse(GD == 0, "Draw", "Lose"))) %>%
  group_by(team) %>%
  arrange(team, Date)

laliga_result$streak <- sequence(rle(as.character(laliga_result$result))$lengths)

Here I’ve filtered the teams with the most streaks:

laliga_result %>% 
  arrange(result, desc(streak)) %>%
  group_by(result) %>%
  filter(streak == max(streak)) %>%
  select(team, result, streak) -> streakstat


show(streakstat %>%
       filter(result == "Win"))
# A tibble: 2 x 3
# Groups:   result [1]
  team         result streak
  <chr>        <chr>   <int>
1 FC Barcelona Win        16
2 Real Madrid  Win        16
show(streakstat %>%
       filter(result == "Draw"))
# A tibble: 1 x 3
# Groups:   result [1]
  team      result streak
  <chr>     <chr>   <int>
1 Burgos CF Draw        9
show(streakstat %>%
       filter(result == "Lose"))
# A tibble: 2 x 3
# Groups:   result [1]
  team             result streak
  <chr>            <chr>   <int>
1 Racing Santander Lose       11
2 UD Las Palmas    Lose       11
streakstat %>%
  hchart(type = "bar", 
         hcaes(x = result, y = streak, group = team)) %>%
  hc_add_theme(hc_theme_gridlight())

To draw plots, I’ve seperated laliga_results into three plots, one for each Win , Lose or Draw result.

  Winning Streaks

laliga_result %>%
  filter(result == "Win") %>%
  group_by(team) %>%
  summarise(avg = mean(streak)) %>%
  arrange(desc(avg)) -> winstreaks
winstreaks <- winstreaks[1:10, ]

ggplot2:

winstreaks$team <- factor(winstreaks$team, levels=winstreaks$team[order(winstreaks$avg)])

ggplot(data = winstreaks, aes(x = team, y = avg, fill = team)) + 
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
  scale_fill_brewer(palette = 9) 

highcharter:

thm <- hc_theme_merge(
  hc_theme_darkunica(),
  hc_theme(
    chart = list(
      backgroundColor = "transparent",
      divBackgroundImage = "http://cdn.wall-pix.net/albums/art-3Dview/00025095.jpg"
    ),
    title = list(
      style = list(
        color = 'white',
        fontFamily = "Open Sans"
      )
    )
  )
)

winstreaks %>%
  hchart(hcaes(x = team, y = avg, group = team), showInLegend = F, 
         type = "column", name = "mean") %>%
  hc_yAxis(title = list(text = "Winning Streak")) %>%
  hc_xAxis(title = list(text = "Team")) %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(thm)

  Losing Streaks

laliga_result %>%
  filter(result == "Lose") %>%
  group_by(team) %>%
  summarise(avg = mean(streak)) %>%
  arrange(desc(avg)) -> losestreaks
losestreaks <- losestreaks[1:10, ]

ggplot2:

losestreaks$team <- factor(losestreaks$team, levels=losestreaks$team[order(losestreaks$avg)])

ggplot(data = losestreaks, aes(x = team, y = avg, color = team)) + 
  geom_point(size = 1.5) +
  geom_line(group = 1, size = 1.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
    scale_color_brewer(palette = 12) 

highcharter:

losestreaks %>%
  hchart(hcaes(x = team, y = avg, group = team), 
         showInLegend = F, type = "column", name = "mean") %>%
  hc_yAxis(title = list(text = "Losing Streak")) %>%
  hc_xAxis(title = list(text = "Team")) %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(thm)

  Draw Streaks

laliga_result %>%
  filter(result == "Draw") %>%
  group_by(team) %>%
  summarise(avg = mean(streak)) %>%
  arrange(desc(avg)) -> drawstreaks
drawstreaks <- drawstreaks[1:10, ]

ggplot2:

drawstreaks$team <- factor(drawstreaks$team, levels=drawstreaks$team[order(drawstreaks$avg)])


ggplot(data = drawstreaks, aes(x = team, y = avg, color = team)) + 
  geom_point(size = 1.5) +
  geom_line(group = 1, size = 1.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
    scale_color_brewer(palette = 8) 

highcharter:

drawstreaks %>%
  hchart(hcaes(x = team, y = avg, group = team), 
         showInLegend = F, type = "column", name = "mean") %>%
  hc_yAxis(title = list(text = "Draw Streak")) %>%
  hc_xAxis(title = list(text = "Team")) %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(thm)

  Q7: Fastest fall?

Once again, I’ve made the same changes as Question 5 to the laliga_weekly data frame. The teams I’ve chosen are the teams that have either ranked last, or fourth to last. Again, I’ve used the laliga_weekly data frame. I add two new columns. In one I’ve calculated the difference between teams with successive ranks, and in the other the number of weeks left after each week(which is the number of remaining matches for each team). The I’ve chosen the teams that have ranked first with a score difference of more than 3 times the remaining games. Then I’ve thrown out the ones that have a score difference of less than 3 times the number of games left. Finally, I arranged the remaining teams in descending order based on the number of games left. (The team with the most number of games left has had the fastest lose in the history of LaLiga)

laliga_weekly %>%
  filter(rank == teamnum | rank == teamnum - 3) %>%
  arrange(Season, week, rank) %>%
  mutate(scoredif = lag(score) - score, 
         gamesleft = (teamnum - 1) * 2 - week)  %>%
  select(Season, week, Date, team, score, rank, scoredif, teamnum, gamesleft) %>%
  arrange(Season, week) %>%
  filter(scoredif > gamesleft * 3) %>%
  group_by(Season) %>%
  filter(gamesleft == max(gamesleft)) %>%
  ungroup() %>%
  arrange(desc(gamesleft)) %>%
  mutate(order = row_number(gamesleft)) -> fastestlose

fastestlose <- fastestlose[1:10, ]

ggplot2:

fastestlose$team <- factor(fastestlose$team, levels=fastestlose$team[order(fastestlose$gamesleft)])


ggplot(data = fastestlose, aes(x = team, y = gamesleft, fill = team)) + 
  geom_bar(stat = "identity")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
    scale_fill_brewer(palette = 3) 

highcharter:

fastestlose %>%
  hchart(hcaes(x = team, y = gamesleft, group = team), 
         showInLegend = FALSE, type = "column", name = "mean") %>%
  hc_yAxis(title = list(text = "Games Left")) %>%
  hc_xAxis(title = list(text = "Team")) %>%
  hc_tooltip(valueDecimals = 3) %>%
  hc_add_theme(hc_theme_google())

  Q8: Team standings during 1998 Season.

Once again, simply by using the laliga_weekly data frame, I’ve computed the results at the end of each week in 1998. I’ve also added a new column to account for the way the dates are shown in the sample picture.

laliga_weekly %>%
  filter(Season == 1998) %>%
  select(Season, week, Date, team, opp, score, rank) -> laliga1998

laliga1998 %>%
  mutate(Date1 = Date) %>%
  separate(Date, c("year", "month_number", "day"), "-") %>%
  mutate(Date = Date1) %>%
  select(-Date1) %>%
  select(-year) %>%
  select(-month_number) -> laliga1998

laliga1998$month = str_sub(months(as.Date(laliga1998$Date)), start = 1, end =3)

laliga1998 %>%
  mutate(month_date = paste(month, as.numeric(day))) %>%
  group_by(week) %>%
  arrange(Date) %>%
  mutate(month_date = month_date[which.max(Date)], 
         Date = max(Date)) %>%
  ungroup() %>%
  arrange(week, rank) %>%
  mutate(dateorder = row_number(week)) -> laliga1998

dates <- laliga1998 %>%
  select(month_date) %>%
  distinct() %>%
  unlist() %>%
  unname()

teams <- laliga1998 %>%
  ungroup() %>%
  filter(week == max(week)) %>%
  arrange(desc(rank)) %>%
  select(team) %>%
  distinct() %>%
  unlist() %>%
  unname()

ggplot2:

ggplot(data = laliga1998, aes(x = month_date, y = rank, color = team)) + 
  geom_point() + 
  geom_line(size = 1, aes(group = team)) + 
  theme_minimal() + 
  scale_color_discrete(breaks = teams) + 
  scale_x_discrete(limit = laliga1998$month_date) +
  guides(color=guide_legend(
                 keywidth=0.5,
                 keyheight=0.235,
                 default.unit="inch")
      ) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1), 
        legend.title = element_blank()) +
  labs(title = "Team Standings in 2012", 
       x = "Date", y = "Rank")

highcharter:

laliga1998 %>%
  hchart(type = "line",hcaes(x = month_date, y = rank, group = team)) %>%
  hc_legend(align = "right",  
            verticalAlign = "top", 
            layout = "vertical", x = 0, y = 100) %>%
  hc_yAxis(reversed = TRUE)

laliga2012 <- spain %>%
  filter(Season == 2012) %>%
  mutate(GD=hgoal-vgoal,
         result = ifelse(GD > 0, "Home Wins", ifelse(GD<0, "Away Wins", "Draw")))

teamorder <- laliga2012 %>%
  select(team = home) %>%
  distinct() %>%
  arrange(team)

laliga2012$home <- factor(laliga2012$home, levels=rev(teamorder$team))
laliga2012$visitor <- factor(laliga2012$visitor, levels=teamorder$team)

ggplot(laliga2012, aes(as.factor(home), as.factor(visitor))) + 
  geom_tile(aes(fill = result), 
            colour= "white", 
            size = 0.5, 
            stat = "identity", 
            height = 1, width = 1) +
  geom_text(aes(label = FT), 
            color="white", 
            size=rel(4.2), 
            fontface = "bold") +
  theme_minimal() +
  theme(
    plot.background = element_rect(fill="black"),
    panel.background = element_rect(fill="black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(fill=NA, color="white", size=0.5, linetype="solid"),
    axis.line = element_blank(),
    axis.ticks = element_blank(), 
    axis.text.x  = element_text(angle=45, vjust=0.5, hjust=0),
    axis.title.x = element_text(color = "white", size = rel(2.5)), 
    axis.title.y = element_text(color = "white", size = rel(2.5)),
    axis.text = element_text(color="white", size=rel(1), face = "bold"),
    legend.title = element_text(color="white", size=rel(1.5), face = "bold.italic"), 
    legend.text = element_text(color="white", size=rel(1), face = "bold"), 
    legend.key = element_blank(),
    legend.background = element_blank()
  ) + 
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0), 
                   position = "top") +
  coord_flip() +
  xlab("Home") + 
  ylab("Away") +
  guides(fill = guide_legend(title="Result", 
                             default.unit="inch", 
                             title.position = "top", 
                             title.hjust = 0.5,
                             title.vjust = 0.5,
                             label.hjust = 0.5)) + 
  scale_fill_manual(labels = c("Away Wins", 
                               "Draw", 
                               "Home Wins"), 
                    values = c("Away Wins"= "darkgoldenrod3", 
                               "Draw"= "darkgrey", 
                               "Home Wins" = "darkgreen"))


  Q10: Three interesting statistics.

  Zero to Hero

In this section, I’ve found the teams with the best comebacks. Meaning, the teams with the most change in ranks between halfseason and final results.

laliga_complete <- rbind(laliga_halfseason, 
                         laliga_fullseason) %>% arrange(Season)

rankchangestat <- laliga_complete %>%
  group_by(Season, team) %>%
  mutate(dif = lead(rank) - rank) %>%
  mutate(rankchange = paste(rank, lead(rank), sep = "-")) %>%
  filter(!is.na(dif)) %>%
  select(Season, team, rankchange, dif) %>%
  arrange(Season, team) 

biggestcomebacks <- rankchangestat %>%
  arrange(dif)
biggestcomebacks <- biggestcomebacks[1:10, ]
biggestcomebacks <- biggestcomebacks %>%
  mutate(dif = abs(dif))

biggestcomebacks$team <- factor(biggestcomebacks$team, levels=biggestcomebacks$team[order(biggestcomebacks$dif)])

ggplot2:

ggplot(data = biggestcomebacks, aes(x = team, y = dif, color = Season)) + 
  geom_point(size = 4) + 
  geom_line(group = 1, size = 2) +  
  geom_label(aes(label = Season)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
  labs(title = "Biggest Comeback", 
       x = "team", y = "Rank Change") + 
  scale_y_discrete(limit = biggestcomebacks$dif)

highcharter:

biggestcomebacks %>%
hchart(type = "column",
       hcaes(x = team, y = dif), 
       name = "rank change", 
       color = "deeppink") %>%
  hc_add_theme(hc_theme_google())

  2

  Just Out of Reach

In this section, I’ve arranged the runner-ups of each year based on the difference in their score with the champion’s and plotted the top 10 least differences.

laliga_runnerup <- laliga_fullseason %>%
  filter(rank == 2 | rank == 1) %>%
  mutate(scoredif = lag(score) - score) %>%
  filter(scoredif != 0) %>%
  arrange(scoredif) 

ggplot2:

ggplot(data = laliga_runnerup, aes(x = team, y = scoredif))  + 
  geom_label(aes(label = Season, 
                  fill = as.character(row_number(scoredif) %/% 10 + 1))) + 
  scale_fill_brewer(palette = 10) +
  scale_y_reverse() +
  guides(fill = guide_legend(title = "Rank of Score Difference")) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

highcharter:

laliga_runnerup[1:20, ] %>%
  hchart(type = "column", 
         hcaes(x = team, y = scoredif, group = Season))

  The Verge of The Tipping Point

In this section, I’ve found the fewest points while surviving relegation.

laliga_relagation_survive <- laliga_fullseason %>%
  group_by(Season) %>%
  mutate(teamnum = n_distinct(team)) %>%
  filter(rank == teamnum - 3 | rank == teamnum - 2) %>%
  mutate(scoredif = score - lead(score)) %>%
  filter(rank == teamnum - 3 & scoredif != 0) %>%
  group_by(scoredif) %>%
  summarise(num = n()) %>%
  arrange(desc(num))

ggplot2:

ggplot(data = laliga_relagation_survive, aes(x = scoredif, y = num, color = num)) + 
  geom_point(size = 2) + 
  geom_line(group = 1, size = 1.5) + 
  theme(legend.title =element_blank())

highcharter:

laliga_relagation_survive %>%
  hchart(type = "column", 
         hcaes(x = scoredif, y = num)) %>%
  hc_add_theme(thm)

  The Ultimate Fall From Grace

In this section, I’ve computed the worst defeat by a winner in the season after its championship.

laliga_weekly %>%
  ungroup() %>%
  arrange(Date, team) %>%
  group_by(Season) %>%
  mutate(winner = team[which.max(score)]) %>% 
  ungroup() %>%
  arrange(Date) %>%
  mutate(winner = lag(winner)) %>%
  group_by(Season) %>%
  arrange(Date) %>%
  mutate(prevwinner = winner[which.min(row_number(Date))]) -> history

history %>%
  group_by(Season) %>%
  filter(team == prevwinner) %>%
  select(Season, team, opp, GA, GF, FT, GD, prevwinner) %>%
  arrange(GD)-> shame 
shame <- shame[1:15, ]
shame <- shame %>%
  mutate(GD = abs(GD)) %>%
  select(Season, team, GD)

ggplot2:

ggplot(data = shame, aes(x = team, y = GD, fill = as.character(Season), show.legend = F), show.legend = FALSE) + 
  geom_label(aes(label = Season)) +  
  scale_fill_brewer(palette = "Spectral")  

highcharter:

shame %>%
  hchart(type = "bar", 
         hcaes(x = team, y = GD, group = Season))